In [37]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Echo the full path of every file found beneath the Kaggle input mount.
# os.walk yields nothing when the directory does not exist, so this is
# silent outside a Kaggle session.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
import warnings
# Ignore all warnings for the rest of the session.
# NOTE(review): this is global, so deprecation warnings raised by later
# pandas/seaborn calls are hidden as well.
warnings.simplefilter("ignore")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as px1
import plotly.graph_objects as go
# Initialise plotly.offline (imported as px1) for rendering inside the notebook.
px1.init_notebook_mode()
In [39]:
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly notebook mode again, this time with connected=True
# (the previous cell already called px1.init_notebook_mode()).
init_notebook_mode(connected=True)
import plotly.express as px
Data Summary¶
In [40]:
# Location of the Billionaires Statistics CSV.
# Set the BILLIONAIRES_CSV environment variable to run the notebook on
# another machine; the default is the original author's local path.
DATA_PATH = os.environ.get(
    "BILLIONAIRES_CSV",
    r"D:\Empty\Kaggle\datasets\Billionaires Statistics Dataset.csv",
)
d1 = pd.read_csv(DATA_PATH)
# Preview the first six rows.
d1.head(6)
Out[40]:
| rank | finalWorth | category | personName | age | country | city | source | industries | countryOfCitizenship | ... | cpi_change_country | gdp_country | gross_tertiary_education_enrollment | gross_primary_education_enrollment_country | life_expectancy_country | tax_revenue_country_country | total_tax_rate_country | population_country | latitude_country | longitude_country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 211000 | Fashion & Retail | Bernard Arnault & family | 74.0 | France | Paris | LVMH | Fashion & Retail | France | ... | 1.1 | $2,715,518,274,227 | 65.6 | 102.5 | 82.5 | 24.2 | 60.7 | 67059887.0 | 46.227638 | 2.213749 |
| 1 | 2 | 180000 | Automotive | Elon Musk | 51.0 | United States | Austin | Tesla, SpaceX | Automotive | United States | ... | 7.5 | $21,427,700,000,000 | 88.2 | 101.8 | 78.5 | 9.6 | 36.6 | 328239523.0 | 37.090240 | -95.712891 |
| 2 | 3 | 114000 | Technology | Jeff Bezos | 59.0 | United States | Medina | Amazon | Technology | United States | ... | 7.5 | $21,427,700,000,000 | 88.2 | 101.8 | 78.5 | 9.6 | 36.6 | 328239523.0 | 37.090240 | -95.712891 |
| 3 | 4 | 107000 | Technology | Larry Ellison | 78.0 | United States | Lanai | Oracle | Technology | United States | ... | 7.5 | $21,427,700,000,000 | 88.2 | 101.8 | 78.5 | 9.6 | 36.6 | 328239523.0 | 37.090240 | -95.712891 |
| 4 | 5 | 106000 | Finance & Investments | Warren Buffett | 92.0 | United States | Omaha | Berkshire Hathaway | Finance & Investments | United States | ... | 7.5 | $21,427,700,000,000 | 88.2 | 101.8 | 78.5 | 9.6 | 36.6 | 328239523.0 | 37.090240 | -95.712891 |
| 5 | 6 | 104000 | Technology | Bill Gates | 67.0 | United States | Medina | Microsoft | Technology | United States | ... | 7.5 | $21,427,700,000,000 | 88.2 | 101.8 | 78.5 | 9.6 | 36.6 | 328239523.0 | 37.090240 | -95.712891 |
6 rows × 35 columns
In [41]:
# Independent copy of the freshly loaded frame; later cells fill missing
# ages in `d1` with the mode and in `data` with the mean.
data=d1.copy()
Data Exploration¶
In [42]:
# Report the (rows, columns) shape and the total number of cells of d1.
print(f"The shape of the DataFrame is:{d1.shape}")
print(f"The size of the DataFrame is:{d1.size}")
The shape of the DataFrame is:(2640, 35) The size of the DataFrame is:92400
In [43]:
# List the column labels of d1.
d1.columns
Out[43]:
Index(['rank', 'finalWorth', 'category', 'personName', 'age', 'country',
'city', 'source', 'industries', 'countryOfCitizenship', 'organization',
'selfMade', 'status', 'gender', 'birthDate', 'lastName', 'firstName',
'title', 'date', 'state', 'residenceStateRegion', 'birthYear',
'birthMonth', 'birthDay', 'cpi_country', 'cpi_change_country',
'gdp_country', 'gross_tertiary_education_enrollment',
'gross_primary_education_enrollment_country', 'life_expectancy_country',
'tax_revenue_country_country', 'total_tax_rate_country',
'population_country', 'latitude_country', 'longitude_country'],
dtype='object')
- Numerical: rank, finalWorth, age
- Categorical: category, country, city, industries, organization, selfMade, status, gender
- Mixed: personName, source, birthDate
In [44]:
# Per-column dtype and non-null count, plus overall memory usage.
d1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2640 entries, 0 to 2639 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rank 2640 non-null int64 1 finalWorth 2640 non-null int64 2 category 2640 non-null object 3 personName 2640 non-null object 4 age 2575 non-null float64 5 country 2602 non-null object 6 city 2568 non-null object 7 source 2640 non-null object 8 industries 2640 non-null object 9 countryOfCitizenship 2640 non-null object 10 organization 325 non-null object 11 selfMade 2640 non-null bool 12 status 2640 non-null object 13 gender 2640 non-null object 14 birthDate 2564 non-null object 15 lastName 2640 non-null object 16 firstName 2637 non-null object 17 title 339 non-null object 18 date 2640 non-null object 19 state 753 non-null object 20 residenceStateRegion 747 non-null object 21 birthYear 2564 non-null float64 22 birthMonth 2564 non-null float64 23 birthDay 2564 non-null float64 24 cpi_country 2456 non-null float64 25 cpi_change_country 2456 non-null float64 26 gdp_country 2476 non-null object 27 gross_tertiary_education_enrollment 2458 non-null float64 28 gross_primary_education_enrollment_country 2459 non-null float64 29 life_expectancy_country 2458 non-null float64 30 tax_revenue_country_country 2457 non-null float64 31 total_tax_rate_country 2458 non-null float64 32 population_country 2476 non-null float64 33 latitude_country 2476 non-null float64 34 longitude_country 2476 non-null float64 dtypes: bool(1), float64(14), int64(2), object(18) memory usage: 704.0+ KB
In [45]:
# Number of missing values in each column.
d1.isnull().sum()
Out[45]:
rank 0 finalWorth 0 category 0 personName 0 age 65 country 38 city 72 source 0 industries 0 countryOfCitizenship 0 organization 2315 selfMade 0 status 0 gender 0 birthDate 76 lastName 0 firstName 3 title 2301 date 0 state 1887 residenceStateRegion 1893 birthYear 76 birthMonth 76 birthDay 76 cpi_country 184 cpi_change_country 184 gdp_country 164 gross_tertiary_education_enrollment 182 gross_primary_education_enrollment_country 181 life_expectancy_country 182 tax_revenue_country_country 183 total_tax_rate_country 182 population_country 164 latitude_country 164 longitude_country 164 dtype: int64
Percentage of null values
In [46]:
# Missing-value summary: absolute count per column plus the share of rows
# affected, formatted as a percentage string.
null_counts = d1.isnull().sum()
null_share = round((null_counts * 100) / (len(d1)), 2)

missing_df = null_counts.to_frame().rename(columns={0: "Missing Values"})
missing_df["Percentage of missing values"] = null_share.astype(str) + "%"
missing_df
Out[46]:
| Missing Values | Percentage of missing values | |
|---|---|---|
| rank | 0 | 0.0% |
| finalWorth | 0 | 0.0% |
| category | 0 | 0.0% |
| personName | 0 | 0.0% |
| age | 65 | 2.46% |
| country | 38 | 1.44% |
| city | 72 | 2.73% |
| source | 0 | 0.0% |
| industries | 0 | 0.0% |
| countryOfCitizenship | 0 | 0.0% |
| organization | 2315 | 87.69% |
| selfMade | 0 | 0.0% |
| status | 0 | 0.0% |
| gender | 0 | 0.0% |
| birthDate | 76 | 2.88% |
| lastName | 0 | 0.0% |
| firstName | 3 | 0.11% |
| title | 2301 | 87.16% |
| date | 0 | 0.0% |
| state | 1887 | 71.48% |
| residenceStateRegion | 1893 | 71.7% |
| birthYear | 76 | 2.88% |
| birthMonth | 76 | 2.88% |
| birthDay | 76 | 2.88% |
| cpi_country | 184 | 6.97% |
| cpi_change_country | 184 | 6.97% |
| gdp_country | 164 | 6.21% |
| gross_tertiary_education_enrollment | 182 | 6.89% |
| gross_primary_education_enrollment_country | 181 | 6.86% |
| life_expectancy_country | 182 | 6.89% |
| tax_revenue_country_country | 183 | 6.93% |
| total_tax_rate_country | 182 | 6.89% |
| population_country | 164 | 6.21% |
| latitude_country | 164 | 6.21% |
| longitude_country | 164 | 6.21% |
In [47]:
# NOTE(review): the unique() result is discarded — a cell only displays
# its last expression.
d1["country"].unique()
# Number of distinct non-null country values.
d1["country"].nunique()
Out[47]:
78
In [48]:
# Summary statistics of the numeric columns, transposed so each column becomes a row.
d1.describe().transpose()
Out[48]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| rank | 2640.0 | 1.289159e+03 | 7.396937e+02 | 1.000000 | 6.590000e+02 | 1.312000e+03 | 1.905000e+03 | 2.540000e+03 |
| finalWorth | 2640.0 | 4.623788e+03 | 9.834241e+03 | 1000.000000 | 1.500000e+03 | 2.300000e+03 | 4.200000e+03 | 2.110000e+05 |
| age | 2575.0 | 6.514019e+01 | 1.325810e+01 | 18.000000 | 5.600000e+01 | 6.500000e+01 | 7.500000e+01 | 1.010000e+02 |
| birthYear | 2564.0 | 1.957183e+03 | 1.328252e+01 | 1921.000000 | 1.948000e+03 | 1.957000e+03 | 1.966000e+03 | 2.004000e+03 |
| birthMonth | 2564.0 | 5.740250e+00 | 3.710085e+00 | 1.000000 | 2.000000e+00 | 6.000000e+00 | 9.000000e+00 | 1.200000e+01 |
| birthDay | 2564.0 | 1.209984e+01 | 9.918876e+00 | 1.000000 | 1.000000e+00 | 1.100000e+01 | 2.100000e+01 | 3.100000e+01 |
| cpi_country | 2456.0 | 1.277552e+02 | 2.645295e+01 | 99.550000 | 1.172400e+02 | 1.172400e+02 | 1.250800e+02 | 2.885700e+02 |
| cpi_change_country | 2456.0 | 4.364169e+00 | 3.623763e+00 | -1.900000 | 1.700000e+00 | 2.900000e+00 | 7.500000e+00 | 5.350000e+01 |
| gross_tertiary_education_enrollment | 2458.0 | 6.722567e+01 | 2.134343e+01 | 4.000000 | 5.060000e+01 | 6.560000e+01 | 8.820000e+01 | 1.366000e+02 |
| gross_primary_education_enrollment_country | 2459.0 | 1.028585e+02 | 4.710977e+00 | 84.700000 | 1.002000e+02 | 1.018000e+02 | 1.026000e+02 | 1.421000e+02 |
| life_expectancy_country | 2458.0 | 7.812282e+01 | 3.730099e+00 | 54.300000 | 7.700000e+01 | 7.850000e+01 | 8.090000e+01 | 8.420000e+01 |
| tax_revenue_country_country | 2457.0 | 1.254624e+01 | 5.368625e+00 | 0.100000 | 9.600000e+00 | 9.600000e+00 | 1.280000e+01 | 3.720000e+01 |
| total_tax_rate_country | 2458.0 | 4.396334e+01 | 1.214530e+01 | 9.900000 | 3.660000e+01 | 4.120000e+01 | 5.910000e+01 | 1.063000e+02 |
| population_country | 2476.0 | 5.102053e+08 | 5.542447e+08 | 38019.000000 | 6.683440e+07 | 3.282395e+08 | 1.366418e+09 | 1.397715e+09 |
| latitude_country | 2476.0 | 3.490359e+01 | 1.700350e+01 | -40.900557 | 3.586166e+01 | 3.709024e+01 | 4.046367e+01 | 6.192411e+01 |
| longitude_country | 2476.0 | 1.258316e+01 | 8.676299e+01 | -106.346771 | -9.571289e+01 | 1.045153e+01 | 1.041954e+02 | 1.748860e+02 |
In [49]:
# Count rows whose country value already appeared in an earlier row
# (missing values are compared like any other value).
d1["country"].duplicated().sum()
Out[49]:
2561
Data Accessing¶
- Numerical: rank, finalWorth, age
- Categorical: category, country, city, industries, organization, selfMade, status, gender
- Mixed: personName, source, birthDate
Defining the Problems¶
Wealth Distribution:
- What is the distribution of wealth among different categories?
- Are there certain industries that dominate the top ranks in terms of wealth?
Geographical Analysis:
- Which countries have the highest representation among the wealthiest individuals?
- Is there a correlation between the country of citizenship and the individual's current residence?
Age and Wealth:
- How does the age of billionaires correlate with their wealth?
- Are there notable differences in the wealth distribution among different age groups?
Industry Insights:
- Which industries have the highest concentration of billionaires?
- Are there specific industries that consistently appear among the top ranks?
Source of Wealth:
- What are the primary sources of wealth for the billionaires in the dataset?
- Is there a relationship between the source of wealth and the total worth of an individual?
Country-specific Analysis:
- Can we identify any economic indicators (e.g., GDP, tax rates) that correlate with the number or wealth of billionaires in a country?
- How does life expectancy in a country correlate with the wealth of its billionaires?
Population and Wealth:
- Is there a correlation between the population of a country and the number of billionaires it has?
- How does the wealth of individuals compare across countries with different population sizes?
Spatial Analysis:
- Can we visualize the geographical distribution of billionaires on a world map?
- Are there clusters or patterns in the distribution of billionaires based on latitude and longitude?
Gender Representation:
- What is the gender distribution among the billionaires in the dataset?
- Are there notable differences in wealth between male and female billionaires?
Educational Insights:
- Is there a correlation between the level of education (e.g., tertiary education enrollment) in a country and the wealth of its billionaires?
In [50]:
# Show the column labels of d1 again.
d1.columns
Out[50]:
Index(['rank', 'finalWorth', 'category', 'personName', 'age', 'country',
'city', 'source', 'industries', 'countryOfCitizenship', 'organization',
'selfMade', 'status', 'gender', 'birthDate', 'lastName', 'firstName',
'title', 'date', 'state', 'residenceStateRegion', 'birthYear',
'birthMonth', 'birthDay', 'cpi_country', 'cpi_change_country',
'gdp_country', 'gross_tertiary_education_enrollment',
'gross_primary_education_enrollment_country', 'life_expectancy_country',
'tax_revenue_country_country', 'total_tax_rate_country',
'population_country', 'latitude_country', 'longitude_country'],
dtype='object')
- Wealth Distribution:
- What is the distribution of wealth among different categories?
- Are there certain industries that dominate the top ranks in terms of wealth?
Conclusion:¶
Top Industries Ranking in Wealth¶
- Automotive.
- Technology
- Telecom
- Logistics
- Metals and Mining
Insights:¶
- These industries dominate the top ranks in wealth, emphasizing their economic significance and success.
- The financial achievements within each sector contribute to the overall diversity of wealth distribution.
In [51]:
# Order rows richest-first, then draw one horizontal bar per category.
# NOTE(review): sns.barplot aggregates the rows of each category (mean by
# default), so each bar is an average finalWorth rather than a total —
# confirm that this is the intended reading of "wealth distribution".
sorted_df=d1.sort_values(by='finalWorth', ascending=False)
# errorbar=None turns off the error bars.
sns.barplot(data=sorted_df,x="finalWorth",y="category",palette='viridis',errorbar=None)
plt.title("Wealth distribution based on industries or categories")
Out[51]:
Text(0.5, 1.0, 'Wealth distribution based on industries or categories')
2. Industry Insights:¶
- Which industries have the highest concentration of billionaires?
- Which industries have the highest representation among the individuals in the dataset?
Conclusion:¶
- Food and beverages, finance and investments, manufacturing, technology, fashion and retail, energy, and healthcare have the highest concentration of billionaires.
- Finance and investments, manufacturing, technology, and fashion and retail are the sectors that contain the most male billionaires.
- Female billionaires, by contrast, are mainly found in the fashion and retail, food and beverages, and manufacturing industries.
- Based on the analysis, it can be concluded that the majority of billionaires fall within the age bracket of 50 to 70.
In [52]:
# Count of billionaires per industry, split by gender and drawn as step outlines.
sns.displot(data=d1,x="industries",hue="gender",palette="viridis",element="step")
# Rotate the x tick labels vertically.
plt.xticks(rotation="vertical")
plt.title("Industries with highest representation")
Out[52]:
Text(0.5, 1.0, 'Industries with highest representation')
In [53]:
# Fill missing ages in d1 with the most frequent age.
# The result is assigned back rather than calling fillna(inplace=True) on
# the d1["age"] selection: under pandas Copy-on-Write that selection is a
# temporary object, so an in-place fill would never reach d1.
d1["age"] = d1["age"].fillna(d1["age"].mode()[0])
In [54]:
# Fill missing ages in the `data` copy with the mean age.
# The result is assigned back rather than calling fillna(inplace=True) on
# the data["age"] selection: under pandas Copy-on-Write that selection is
# a temporary object, so an in-place fill would never reach `data`.
data["age"] = data["age"].fillna(data["age"].mean())
In [55]:
# Interactive Plotly histogram of age in d1 (missing ages filled with the mode).
px.histogram(d1,x="age")
In [56]:
# Skewness and histogram of age in `data`, the copy whose missing ages
# were filled with the mean.
print("The skewness of age column is:",data["age"].skew())
sns.displot(data=data,x="age",kind="hist")
The skewness of age column is: -0.07903682421854166
Out[56]:
<seaborn.axisgrid.FacetGrid at 0x102e50eb150>
3. Geographical Analysis:¶
- Which countries have the highest representation among the wealthiest individuals?
- Is there a correlation between the country of citizenship and the individual's current residence?
Conclusion:¶
- The top five countries with the most billionaires are the United States, China, India, Germany, and the United Kingdom.
- The high Cramér's V value suggests that there is a substantial relationship between the country of residence and the country of citizenship in the dataset.
In [57]:
# Five most frequent country values, shown as a one-column frame.
d1["country"].value_counts().head(5).to_frame()
Out[57]:
| count | |
|---|---|
| country | |
| United States | 754 |
| China | 523 |
| India | 157 |
| Germany | 102 |
| United Kingdom | 82 |
In [58]:
# Frequency of every country value, most frequent first.
d1["country"].value_counts()
Out[58]:
country
United States 754
China 523
India 157
Germany 102
United Kingdom 82
...
Portugal 1
Georgia 1
Eswatini (Swaziland) 1
Uzbekistan 1
Armenia 1
Name: count, Length: 78, dtype: int64
In [59]:
# Frequency of every countryOfCitizenship value, most frequent first.
d1["countryOfCitizenship"].value_counts()
Out[59]:
countryOfCitizenship
United States 735
China 491
India 169
Germany 126
Russia 104
...
Belize 1
Eswatini (Swaziland) 1
Venezuela 1
Algeria 1
Panama 1
Name: count, Length: 77, dtype: int64
In [60]:
from scipy.stats import chi2_contingency
# Cross-tabulate country of residence against country of citizenship, then
# run a chi-square test of independence on the resulting counts.
contingency_table = pd.crosstab(index=d1["country"], columns=d1["countryOfCitizenship"])
chi2_result = chi2_contingency(contingency_table)
chi2, p = chi2_result[0], chi2_result[1]
print(f"Chi-square value: {chi2}, p-value: {p}")
Chi-square value: 132750.7197374479, p-value: 0.0
In [61]:
def cramers_v(confusion_matrix):
    """Return the bias-corrected Cramér's V of a two-way contingency table.

    Parameters
    ----------
    confusion_matrix : pandas.DataFrame or 2-D array-like
        Table of observed counts, e.g. the output of ``pd.crosstab``.

    Returns
    -------
    float
        Association strength: 0 means no association, 1 means perfect
        association. ``nan`` is returned when the statistic is undefined
        (fewer than two rows or columns, or too few observations for the
        bias correction).

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency``, e.g. when a row or column of
        the table sums to zero.
    """
    observed = np.asarray(confusion_matrix)
    n = observed.sum()
    r, k = observed.shape

    # With a single row/column or at most one observation the formula
    # below divides by zero; report "undefined" instead.
    if n <= 1 or min(r, k) < 2:
        return float("nan")

    # Cramér's V is defined on the plain Pearson chi-square. By default
    # chi2_contingency applies Yates' continuity correction to 2x2 tables,
    # which would understate V there, so switch it off.
    chi2 = chi2_contingency(observed, correction=False)[0]
    phi2 = chi2 / n

    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denom = min((kcorr - 1), (rcorr - 1))
    if denom <= 0:
        # Table too sparse relative to its size for the correction to apply.
        return float("nan")
    return float(np.sqrt(phi2corr / denom))
# Calculate Cramér's V for the country × countryOfCitizenship contingency
# table built in the previous cell.
association_strength = cramers_v(contingency_table)
print(f"Cramér's V: {association_strength}")
Cramér's V: 0.8185133547927517
4. Age and Wealth:¶
- How does the age of billionaires correlate with their wealth?
- Are there notable differences in the wealth distribution among different age groups?
Conclusion:¶
- A correlation coefficient of 0.069 is close to zero, suggesting a weak positive correlation. This means that as one variable (age) increases, the other variable (final worth) tends to increase slightly, but the relationship is not strong.
- Yes, there is a notable difference in the wealth distribution: people younger than 50 have a lower net worth than people older than 50.
In [62]:
# Fill any remaining missing ages in d1 with the most frequent age.
# The result is assigned back rather than calling fillna(inplace=True) on
# the column selection, which does not update d1 under pandas
# Copy-on-Write. The statement is idempotent, so re-running it is harmless.
d1["age"] = d1["age"].fillna(d1["age"].mode()[0])
# Confirm that no missing ages remain (expected: 0).
d1.age.isnull().sum()
Out[62]:
0
In [63]:
from scipy.stats import pearsonr
# Pearson correlation between age and net worth; only the coefficient is
# used, the accompanying p-value is ignored.
corr = pearsonr(d1["age"], d1["finalWorth"])[0]
print('Pearsons correlation: %.3f' % corr)
Pearsons correlation: 0.069
In [64]:
# Plotly bar chart with age on the x axis and finalWorth on the y axis.
px.bar(d1,x="age",y="finalWorth")
In [65]:
# Number of distinct values in the source column.
d1.source.nunique()
Out[65]:
906
In [66]:
# Frequency of every source value, most frequent first.
d1.source.value_counts()
Out[66]:
source
Real estate 151
Investments 92
Diversified 91
Pharmaceuticals 85
Software 63
...
Chemical industry 1
Readymade garments 1
Stock brokerage 1
Nutrition, wellness products 1
Tyre manufacturing machinery 1
Name: count, Length: 906, dtype: int64
In [70]:
# Plotly histogram of the source column (one bar per distinct source value).
px.histogram(d1,x="source")
In [ ]:
from scipy.stats import kurtosis,skew
# Kurtosis of age; scipy.stats.kurtosis uses Fisher's definition by
# default, under which a normal distribution scores 0.
kur=kurtosis(d1["age"])
kur
Out[ ]:
-0.1387691447141699
In [ ]:
# Skewness of age in d1, computed with scipy.stats.skew.
skew(d1["age"])
Out[ ]:
-0.051088117525369715
In [ ]:
# Mean age in d1 (after the mode imputation of missing ages).
d1.age.mean()
Out[ ]:
65.01363636363637
In [ ]:
# Kernel density estimate of age.
sns.displot(data=d1,x="age",kind="kde")
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x102e04759d0>
In [ ]:
# Kurtosis of finalWorth (scipy.stats.kurtosis, Fisher definition by default).
kurtosis(d1["finalWorth"])
Out[ ]:
144.7967937167552
In [ ]:
# Skewness of finalWorth, computed with scipy.stats.skew.
skew(d1["finalWorth"])
Out[ ]:
10.006677577919163
In [ ]:
# Kernel density estimate of finalWorth.
sns.displot(data=d1,x="finalWorth",kind="kde")
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x102e079e610>
In [ ]:
# Shell command: convert billionaires-data-visualization.ipynb to HTML with nbconvert.
!jupyter nbconvert --to html billionaires-data-visualization.ipynb
[NbConvertApp] Converting notebook billionaires-data-visualization.ipynb to html
C:\Users\Naveen\AppData\Local\Programs\Python\Python311\Lib\site-packages\nbformat\__init__.py:93: MissingIDFieldWarning: Code cell is missing an id field, this will become a hard error in future nbformat versions. You may want to use `normalize()` on your notebooks before validations (available since nbformat 5.1.4). Previous versions of nbformat are fixing this issue transparently, and will stop doing so in the future.
validate(nb)
C:\Users\Naveen\AppData\Local\Programs\Python\Python311\share\jupyter\nbconvert\templates\base\display_priority.j2:32: UserWarning: Your element with mimetype(s) dict_keys(['application/vnd.plotly.v1+json']) is not able to be represented.
{%- elif type == 'text/vnd.mermaid' -%}
[NbConvertApp] WARNING | Alternative text is missing on 5 image(s).
[NbConvertApp] Writing 4163755 bytes to billionaires-data-visualization.html
In [ ]: